On Day 16 we built the foundation of the WebSocket real-time collaboration engine. Today we extend that foundation into a full microservice architecture and deploy it as a highly available, distributed system on AWS, focusing on inter-service communication, load balancing, and the Redis cluster configuration.
# infrastructure/services-architecture.yml
services:
  api-gateway:
    type: application-load-balancer
    function: unified entry point and request routing
  gym-management-service:
    type: ecs-fargate
    function: core gym management features
    dependencies: [postgres-rds, redis-cluster]
  collaboration-service:
    type: ecs-fargate
    function: WebSocket real-time collaboration
    dependencies: [redis-cluster, postgres-rds]
  notification-service:
    type: ecs-fargate
    function: push and notification management
    dependencies: [redis-cluster, ses, sns]
  analytics-service:
    type: ecs-fargate
    function: data analytics and reporting
    dependencies: [postgres-rds, s3, cloudwatch]
shared-infrastructure:
  postgres-rds:
    engine: postgresql-15
    deployment: multi-az
  redis-cluster:
    engine: redis-7
    deployment: cluster-mode
    nodes: 3-primary-3-replica
  s3-bucket:
    purpose: static-assets-storage
  cloudfront:
    purpose: cdn-delivery
{
  "family": "collaboration-service",
  "networkMode": "awsvpc",
  "requiresCompatibilities": ["FARGATE"],
  "cpu": "1024",
  "memory": "2048",
  "executionRoleArn": "arn:aws:iam::ACCOUNT:role/ecsTaskExecutionRole",
  "taskRoleArn": "arn:aws:iam::ACCOUNT:role/ecsTaskRole",
  "containerDefinitions": [
    {
      "name": "collaboration-service",
      "image": "ACCOUNT.dkr.ecr.REGION.amazonaws.com/kyo-collaboration:latest",
      "essential": true,
      "portMappings": [
        {
          "containerPort": 3001,
          "protocol": "tcp"
        }
      ],
      "environment": [
        {
          "name": "NODE_ENV",
          "value": "production"
        },
        {
          "name": "REDIS_CLUSTER_ENDPOINT",
          "value": "kyo-redis-cluster.abc123.cache.amazonaws.com:6379"
        }
      ],
      "secrets": [
        {
          "name": "DATABASE_URL",
          "valueFrom": "arn:aws:ssm:REGION:ACCOUNT:parameter/kyo/database-url"
        },
        {
          "name": "JWT_SECRET",
          "valueFrom": "arn:aws:ssm:REGION:ACCOUNT:parameter/kyo/jwt-secret"
        }
      ],
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "/ecs/collaboration-service",
          "awslogs-region": "REGION",
          "awslogs-stream-prefix": "ecs"
        }
      },
      "healthCheck": {
        "command": [
          "CMD-SHELL",
          "curl -f http://localhost:3001/health || exit 1"
        ],
        "interval": 30,
        "timeout": 5,
        "retries": 3,
        "startPeriod": 60
      }
    }
  ]
}
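The environment variables and SSM-backed secrets declared above are injected into the container at startup. Below is a minimal sketch of how the service might validate them before opening any connections; the file name, `loadConfig` helper, and error messages are illustrative, not part of the existing repository.
// collaboration-service/src/config.ts (hypothetical helper)
// Fail fast if the variables defined in the ECS task definition are missing.
export interface ServiceConfig {
  nodeEnv: string;
  redisClusterEndpoint: string;
  databaseUrl: string;
  jwtSecret: string;
}
export const loadConfig = (): ServiceConfig => {
  const required = ['REDIS_CLUSTER_ENDPOINT', 'DATABASE_URL', 'JWT_SECRET'] as const;
  for (const name of required) {
    if (!process.env[name]) {
      throw new Error(`Missing required environment variable: ${name}`);
    }
  }
  return {
    nodeEnv: process.env.NODE_ENV ?? 'production',
    redisClusterEndpoint: process.env.REDIS_CLUSTER_ENDPOINT!,
    databaseUrl: process.env.DATABASE_URL!,
    jwtSecret: process.env.JWT_SECRET!
  };
};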
# infrastructure/alb-config.yml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alb-configuration
data:
  target-groups.json: |
    {
      "gym-management": {
        "port": 3000,
        "protocol": "HTTP",
        "health_check": {
          "path": "/api/health",
          "healthy_threshold": 2,
          "unhealthy_threshold": 3,
          "timeout": 5,
          "interval": 30
        }
      },
      "collaboration": {
        "port": 3001,
        "protocol": "HTTP",
        "health_check": {
          "path": "/health",
          "healthy_threshold": 2,
          "unhealthy_threshold": 3,
          "timeout": 5,
          "interval": 30
        }
      }
    }
  listener-rules.json: |
    {
      "rules": [
        {
          "priority": 100,
          "conditions": [
            {
              "field": "path-pattern",
              "values": ["/api/collaboration/*", "/socket.io/*"]
            }
          ],
          "actions": [
            {
              "type": "forward",
              "target_group": "collaboration"
            }
          ]
        },
        {
          "priority": 200,
          "conditions": [
            {
              "field": "path-pattern",
              "values": ["/api/*"]
            }
          ],
          "actions": [
            {
              "type": "forward",
              "target_group": "gym-management"
            }
          ]
        }
      ]
    }
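The `collaboration` target group health-checks `/health` on port 3001, and the listener rule forwards `/socket.io/*` and `/api/collaboration/*` to it. A minimal server sketch consistent with those settings, assuming the Day 16 service uses Express and Socket.IO (names and file paths are illustrative):
// collaboration-service/src/server.ts (sketch)
import express from 'express';
import { createServer } from 'http';
import { Server } from 'socket.io';
const app = express();
// Health endpoint hit by the ALB target group and the container health checks
app.get('/health', (_req, res) => {
  res.status(200).json({ status: 'ok' });
});
const httpServer = createServer(app);
// Socket.IO answers on the /socket.io/* path matched by the ALB listener rule
export const io = new Server(httpServer, { path: '/socket.io' });
io.on('connection', (socket) => {
  socket.on('disconnect', () => {
    // collaboration session cleanup would go here
  });
});
httpServer.listen(3001);
Because several Fargate tasks sit behind the ALB, the Socket.IO polling handshake needs target-group stickiness (or WebSocket-only transports), and the Redis adapter wired up after the Redis client configuration below keeps broadcasts consistent across tasks.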
{
  "serviceName": "collaboration-service",
  "cluster": "kyo-production-cluster",
  "taskDefinition": "collaboration-service:latest",
  "desiredCount": 3,
  "launchType": "FARGATE",
  "networkConfiguration": {
    "awsvpcConfiguration": {
      "subnets": [
        "subnet-12345678",
        "subnet-87654321"
      ],
      "securityGroups": [
        "sg-collaboration-service"
      ],
      "assignPublicIp": "DISABLED"
    }
  },
  "loadBalancers": [
    {
      "targetGroupArn": "arn:aws:elasticloadbalancing:REGION:ACCOUNT:targetgroup/collaboration/1234567890123456",
      "containerName": "collaboration-service",
      "containerPort": 3001
    }
  ],
  "serviceRegistries": [
    {
      "registryArn": "arn:aws:servicediscovery:REGION:ACCOUNT:service/srv-collaboration",
      "containerName": "collaboration-service",
      "containerPort": 3001
    }
  ]
}
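With the service registered in Cloud Map via `serviceRegistries`, other services inside the VPC can reach it through its private DNS name instead of going back out through the ALB. A sketch of such an internal call, assuming a Cloud Map namespace named `kyo.local` and an illustrative endpoint (neither is confirmed by the Day 16 code):
// notification-service/src/clients/collaboration.ts (sketch)
// Resolves the collaboration service through its Cloud Map DNS name.
const COLLABORATION_BASE_URL =
  process.env.COLLABORATION_SERVICE_URL ?? 'http://collaboration-service.kyo.local:3001';
export const getActiveSessions = async (gymId: string): Promise<unknown> => {
  // /api/collaboration/sessions is an illustrative endpoint
  const response = await fetch(
    `${COLLABORATION_BASE_URL}/api/collaboration/sessions?gymId=${encodeURIComponent(gymId)}`
  );
  if (!response.ok) {
    throw new Error(`Collaboration service responded with ${response.status}`);
  }
  return response.json();
};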
# infrastructure/redis-cluster.yml
Resources:
  RedisSubnetGroup:
    Type: AWS::ElastiCache::SubnetGroup
    Properties:
      Description: Subnet group for Redis cluster
      SubnetIds:
        - !Ref PrivateSubnet1
        - !Ref PrivateSubnet2
        - !Ref PrivateSubnet3
  RedisCluster:
    Type: AWS::ElastiCache::ReplicationGroup
    Properties:
      ReplicationGroupId: kyo-redis-cluster
      Description: Redis cluster for real-time collaboration
      NodeType: cache.r7g.large
      Engine: redis
      EngineVersion: '7.0'
      Port: 6379
      # Cluster mode enabled requires a cluster-enabled parameter group;
      # NumCacheClusters cannot be combined with NumNodeGroups/ReplicasPerNodeGroup
      CacheParameterGroupName: default.redis7.cluster.on
      NumNodeGroups: 3
      ReplicasPerNodeGroup: 1
      CacheSubnetGroupName: !Ref RedisSubnetGroup
      SecurityGroupIds:
        - !Ref RedisSecurityGroup
      AtRestEncryptionEnabled: true
      TransitEncryptionEnabled: true
      AutomaticFailoverEnabled: true
      MultiAZEnabled: true
      PreferredMaintenanceWindow: sun:05:00-sun:06:00
      SnapshotWindow: 03:00-04:00
      SnapshotRetentionLimit: 7
  RedisSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Security group for Redis cluster
      VpcId: !Ref VPC
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 6379
          ToPort: 6379
          SourceSecurityGroupId: !Ref ApplicationSecurityGroup
          Description: Redis access from application services
// infrastructure/redis-config.ts
// Connection settings for the ElastiCache cluster (option names follow ioredis ClusterOptions).
import type { ConnectionOptions } from 'tls';
export interface RedisClusterConfig {
  nodes: Array<{
    host: string;
    port: number;
  }>;
  options: {
    enableReadyCheck: boolean;
    redisOptions: {
      family: number;
      connectTimeout: number;
      commandTimeout: number;
      enableOfflineQueue: boolean;
      maxRetriesPerRequest: number;
      // TransitEncryptionEnabled is set on the replication group, so the client must use TLS
      tls: ConnectionOptions;
    };
    retryDelayOnFailover: number;
    slotsRefreshTimeout: number;
    slotsRefreshInterval: number;
  };
}
export const createRedisClusterConfig = (): RedisClusterConfig => {
  const clusterEndpoint = process.env.REDIS_CLUSTER_ENDPOINT;
  if (!clusterEndpoint) {
    throw new Error('REDIS_CLUSTER_ENDPOINT environment variable is required');
  }
  // Parse the AWS ElastiCache configuration endpoint ("host:port")
  const [host, portStr] = clusterEndpoint.split(':');
  const port = parseInt(portStr, 10) || 6379;
  return {
    // With cluster mode enabled, the configuration endpoint is the only seed node
    // the client needs; the individual shard nodes are discovered from it.
    nodes: [{ host, port }],
    options: {
      enableReadyCheck: true,
      redisOptions: {
        family: 4,
        connectTimeout: 5000,
        commandTimeout: 5000,
        enableOfflineQueue: false,
        maxRetriesPerRequest: 3,
        tls: {}
      },
      retryDelayOnFailover: 100,
      slotsRefreshTimeout: 10000,
      slotsRefreshInterval: 30000
    }
  };
};
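Since the option names above follow ioredis, the cluster can be opened directly from this config. The sketch below wires it into Socket.IO with the Redis adapter so events broadcast from one Fargate task reach clients connected to the others; it assumes `ioredis` and `@socket.io/redis-adapter` are the libraries in use, and the import paths are illustrative.
// collaboration-service/src/redis-adapter.ts (sketch)
import { Cluster } from 'ioredis';
import { createAdapter } from '@socket.io/redis-adapter';
import type { Server } from 'socket.io';
import { createRedisClusterConfig } from '../infrastructure/redis-config';
export const attachRedisAdapter = (io: Server): void => {
  const config = createRedisClusterConfig();
  // The adapter needs separate connections for publishing and subscribing
  const pubClient = new Cluster(config.nodes, config.options);
  const subClient = new Cluster(config.nodes, config.options);
  io.adapter(createAdapter(pubClient, subClient));
};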
# collaboration-service/Dockerfile
FROM node:18-alpine AS builder
WORKDIR /app
# Copy package manifests
COPY package*.json ./
COPY pnpm-lock.yaml ./
# Install pnpm and project dependencies
RUN npm install -g pnpm
RUN pnpm install --frozen-lockfile
# Copy sources and build
COPY . .
RUN pnpm run build
# Production stage
FROM node:18-alpine AS production
# Create a non-root user
RUN addgroup -g 1001 -S nodejs
RUN adduser -S nodejs -u 1001
WORKDIR /app
# Copy build artifacts
COPY --from=builder --chown=nodejs:nodejs /app/dist ./dist
COPY --from=builder --chown=nodejs:nodejs /app/node_modules ./node_modules
COPY --from=builder --chown=nodejs:nodejs /app/package.json ./package.json
# Health check (node:18-alpine does not ship curl, so install it; the container
# health check in the ECS task definition relies on it as well)
RUN apk add --no-cache curl
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
  CMD curl -f http://localhost:3001/health || exit 1
# Run as the non-root user
USER nodejs
EXPOSE 3001
CMD ["node", "dist/index.js"]
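Because the exec-form `CMD` runs `node` as PID 1, the process receives the SIGTERM that ECS sends before stopping a task. A sketch of a graceful shutdown handler so in-flight WebSocket sessions are closed within the stop timeout; the file name and `registerGracefulShutdown` helper are illustrative.
// collaboration-service/src/shutdown.ts (sketch)
import type { Server as SocketIOServer } from 'socket.io';
export const registerGracefulShutdown = (io: SocketIOServer): void => {
  process.on('SIGTERM', () => {
    // io.close() stops accepting new connections, disconnects clients,
    // and closes the attached HTTP server
    io.close(() => process.exit(0));
    // ECS force-kills the task after its stop timeout, so exit before that regardless
    setTimeout(() => process.exit(1), 25_000).unref();
  });
};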
# .github/workflows/deploy-collaboration-service.yml
name: Deploy Collaboration Service
on:
  push:
    branches: [main]
    paths:
      - 'apps/collaboration-service/**'
      - 'packages/@kyong/kyo-core/**'
env:
  AWS_REGION: ap-northeast-1
  ECR_REPOSITORY: kyo-collaboration
  ECS_SERVICE: collaboration-service
  ECS_CLUSTER: kyo-production-cluster
jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ env.AWS_REGION }}
    - name: Login to Amazon ECR
      id: login-ecr
      uses: aws-actions/amazon-ecr-login@v2
    - name: Build and push image
      id: build-image
      env:
        ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        IMAGE_TAG: ${{ github.sha }}
      run: |
        cd apps/collaboration-service
        docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
        docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
        echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
    - name: Update ECS service
      env:
        IMAGE_URI: ${{ steps.build-image.outputs.image }}
      run: |
        # Fetch the current task definition
        aws ecs describe-task-definition \
          --task-definition $ECS_SERVICE \
          --query taskDefinition > task-def.json
        # Point the container at the new image and strip the read-only fields
        # that register-task-definition rejects
        jq --arg IMAGE_URI "$IMAGE_URI" \
          '.containerDefinitions[0].image = $IMAGE_URI
            | del(.taskDefinitionArn, .revision, .status, .requiresAttributes,
                  .compatibilities, .registeredAt, .registeredBy)' \
          task-def.json > updated-task-def.json
        # Register the new task definition revision
        aws ecs register-task-definition \
          --cli-input-json file://updated-task-def.json
        # Update the service to the latest revision of the family
        aws ecs update-service \
          --cluster $ECS_CLUSTER \
          --service $ECS_SERVICE \
          --task-definition $ECS_SERVICE
        # Wait for the deployment to stabilize
        aws ecs wait services-stable \
          --cluster $ECS_CLUSTER \
          --services $ECS_SERVICE
# infrastructure/monitoring.yml
Resources:
  CollaborationServiceAlarms:
    Type: AWS::CloudWatch::CompositeAlarm
    Properties:
      AlarmName: CollaborationService-Health
      AlarmDescription: Composite alarm for collaboration service health
      ActionsEnabled: true
      AlarmActions:
        - !Ref SNSTopicArn
      AlarmRule: !Sub |
        ALARM(${CPUUtilizationAlarm}) OR
        ALARM(${MemoryUtilizationAlarm}) OR
        ALARM(${ErrorRateAlarm}) OR
        ALARM(${ResponseTimeAlarm})
  CPUUtilizationAlarm:
    Type: AWS::CloudWatch::Alarm
    Properties:
      AlarmName: CollaborationService-HighCPU
      AlarmDescription: High CPU utilization for collaboration service
      MetricName: CPUUtilization
      Namespace: AWS/ECS
      Statistic: Average
      Period: 300
      EvaluationPeriods: 2
      Threshold: 80
      ComparisonOperator: GreaterThanThreshold
      Dimensions:
        - Name: ServiceName
          Value: collaboration-service
        - Name: ClusterName
          Value: kyo-production-cluster
  ErrorRateAlarm:
    Type: AWS::CloudWatch::Alarm
    Properties:
      AlarmName: CollaborationService-HighErrorRate
      AlarmDescription: High error rate for collaboration service
      MetricName: HTTPCode_Target_5XX_Count
      Namespace: AWS/ApplicationELB
      Statistic: Sum
      Period: 300
      EvaluationPeriods: 2
      Threshold: 10
      ComparisonOperator: GreaterThanThreshold
      TreatMissingData: notBreaching
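The alarms above only cover built-in ECS and ALB metrics. For collaboration-specific signals such as active WebSocket connections, the service can publish a custom metric, sketched below with the AWS SDK v3; the `Kyo/Collaboration` namespace and metric name are made up for illustration, and the task role would also need `cloudwatch:PutMetricData`, which is not in the policy shown further below.
// collaboration-service/src/metrics.ts (sketch)
import { CloudWatchClient, PutMetricDataCommand } from '@aws-sdk/client-cloudwatch';
const cloudwatch = new CloudWatchClient({});
// Publish the current WebSocket connection count (e.g. once a minute)
export const publishConnectionCount = async (activeConnections: number): Promise<void> => {
  await cloudwatch.send(
    new PutMetricDataCommand({
      Namespace: 'Kyo/Collaboration',
      MetricData: [
        {
          MetricName: 'ActiveWebSocketConnections',
          Value: activeConnections,
          Unit: 'Count',
          Dimensions: [{ Name: 'ServiceName', Value: 'collaboration-service' }]
        }
      ]
    })
  );
};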
// infrastructure/tracing.ts
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { AWSXRayIdGenerator } from '@opentelemetry/id-generator-aws-xray';
import { AWSXRayPropagator } from '@opentelemetry/propagator-aws-xray';
export const initializeTracing = () => {
  const provider = new NodeTracerProvider({
    resource: new Resource({
      [SemanticResourceAttributes.SERVICE_NAME]: 'collaboration-service',
      [SemanticResourceAttributes.SERVICE_VERSION]: process.env.SERVICE_VERSION || '1.0.0',
      [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'production'
    }),
    // X-Ray requires its own trace ID format
    idGenerator: new AWSXRayIdGenerator()
  });
  // AWS X-Ray integration: propagate trace context across service boundaries
  provider.register({
    propagator: new AWSXRayPropagator()
  });
  return provider;
};
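As written, the provider registers the X-Ray ID generator and propagator but no span exporter, so spans carry X-Ray-compatible IDs and context without being shipped anywhere yet; in an ECS setup that is typically handled by adding an OTLP exporter pointed at an ADOT/X-Ray collector sidecar. Whichever exporter is used, tracing must be initialized before the instrumented modules are loaded. A sketch of the entry-point ordering (the file names and `startServer` export are illustrative):
// collaboration-service/src/index.ts (sketch)
import { initializeTracing } from '../infrastructure/tracing';
// Register the tracer provider before any instrumented modules are imported
initializeTracing();
// Load the rest of the service only after tracing is in place
void import('./server').then((mod) => mod.startServer());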
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "logs:CreateLogGroup",
        "logs:CreateLogStream",
        "logs:PutLogEvents"
      ],
      "Resource": "arn:aws:logs:*:*:*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "ssm:GetParameter",
        "ssm:GetParameters"
      ],
      "Resource": [
        "arn:aws:ssm:*:*:parameter/kyo/*"
      ]
    },
    {
      "Effect": "Allow",
      "Action": [
        "elasticache:Describe*",
        "elasticache:List*"
      ],
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "xray:PutTraceSegments",
        "xray:PutTelemetryRecords"
      ],
      "Resource": "*"
    }
  ]
}
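ECS resolves the `secrets` entries through the execution role at container start; the `ssm:GetParameter` statement on the task role additionally lets the service read parameters under `/kyo/` at runtime, for values that may change without a redeploy. A sketch with the AWS SDK v3 (the `/kyo/feature-flags` parameter name is hypothetical):
// collaboration-service/src/ssm.ts (sketch)
import { SSMClient, GetParameterCommand } from '@aws-sdk/client-ssm';
const ssm = new SSMClient({});
export const getParameter = async (name: string): Promise<string | undefined> => {
  // WithDecryption handles SecureString parameters transparently
  const result = await ssm.send(
    new GetParameterCommand({ Name: name, WithDecryption: true })
  );
  return result.Parameter?.Value;
};
// Example usage: const flags = await getParameter('/kyo/feature-flags');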
Today we completed the cloud deployment architecture for the WebSocket collaboration service: the microservice layout with ECS Fargate task and service definitions, ALB routing for WebSocket traffic, the ElastiCache Redis cluster, the CI/CD pipeline, and monitoring with CloudWatch alarms and X-Ray tracing.